In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
# Single shared encoder instance, refit per column later in the notebook.
# NOTE(review): refitting one instance discards earlier columns' mappings, so
# inverse_transform only reflects the last column encoded — confirm whether
# decoding back to labels is ever needed.
le = LabelEncoder()
In [2]:
# Load the UCI "bank-additional-full" marketing dataset.
# 'unknown' is the dataset's missing-value sentinel, mapped to NaN on read;
# the file is ';'-separated.
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data
# directory (e.g. a Path constant in a config cell) so the notebook runs elsewhere.
df = pd.read_csv(r"D:\Stuff\Data Science\Machine Learning\bank-additional-full.csv" , na_values='unknown' , sep=';')
print(df.shape)
df.head()
(41188, 21)
Out[2]:
age job marital education default housing loan contact month day_of_week ... campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
0 56 housemaid married basic.4y no no no telephone may mon ... 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
1 57 services married high.school NaN no no telephone may mon ... 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
2 37 services married high.school no yes no telephone may mon ... 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
3 40 admin. married basic.6y no no no telephone may mon ... 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no
4 56 services married high.school no no yes telephone may mon ... 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0 no

5 rows × 21 columns

In [3]:
# Column dtypes and non-null counts (shows which columns carry NaNs).
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             40858 non-null  object 
 2   marital         41108 non-null  object 
 3   education       39457 non-null  object 
 4   default         32591 non-null  object 
 5   housing         40198 non-null  object 
 6   loan            40198 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null  float64
 18  euribor3m       41188 non-null  float64
 19  nr.employed     41188 non-null  float64
 20  y               41188 non-null  object 
dtypes: float64(5), int64(5), object(11)
memory usage: 6.6+ MB
In [4]:
# Missing values per column (NaNs come from the 'unknown' sentinel on read).
df.isna().sum()
Out[4]:
age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64
In [5]:
# Columns that still contain missing values, and their dtypes
# (all six turn out to be object/categorical columns).
na_mask = df.isna().any()
na_cols = df.columns[na_mask]
na_dtypes = df[na_cols].dtypes
print(na_dtypes)
job          object
marital      object
education    object
default      object
housing      object
loan         object
dtype: object
In [6]:
# Rounded summary statistics of the numeric columns, transposed for readability.
df.describe().round().T
Out[6]:
count mean std min 25% 50% 75% max
age 41188.0 40.0 10.0 17.0 32.0 38.0 47.0 98.0
duration 41188.0 258.0 259.0 0.0 102.0 180.0 319.0 4918.0
campaign 41188.0 3.0 3.0 1.0 1.0 2.0 3.0 56.0
pdays 41188.0 962.0 187.0 0.0 999.0 999.0 999.0 999.0
previous 41188.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0
emp.var.rate 41188.0 0.0 2.0 -3.0 -2.0 1.0 1.0 1.0
cons.price.idx 41188.0 94.0 1.0 92.0 93.0 94.0 94.0 95.0
cons.conf.idx 41188.0 -41.0 5.0 -51.0 -43.0 -42.0 -36.0 -27.0
euribor3m 41188.0 4.0 2.0 1.0 1.0 5.0 5.0 5.0
nr.employed 41188.0 5167.0 72.0 4964.0 5099.0 5191.0 5228.0 5228.0
In [7]:
# Impute each column that has missing values with its most frequent value
# (all affected columns are categorical, so the mode is the natural choice).
na_cols_list = na_cols.tolist()

for col in na_cols_list:
    most_frequent = df[col].mode().iloc[0]
    df[col] = df[col].fillna(most_frequent)

# Verify the imputation left no missing values behind.
print(df.isna().sum())
age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64
In [8]:
# Box plot of `age` to eyeball outliers before the IQR filtering below.
sns.set_style("darkgrid")
plt.figure(figsize=(13, 3))
plt.title("Detected Outliers")
sns.boxplot(data=df, x='age')
plt.show()
No description has been provided for this image
In [9]:
# Drop `age` outliers outside the 1.5 * IQR fences.
# `inclusive='neither'` reproduces the strict (>, <) bounds used originally.
q1, q3 = df['age'].quantile([0.25, 0.75])
iqr = q3 - q1
df = df[df['age'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr, inclusive='neither')]
In [10]:
# Interactive box plot of `duration` to inspect outliers before filtering.
fig = (
    px.box(df, x='duration')
    .update_traces(marker={'color': '#FF851B'})
    .update_layout(title='Detected Outliers', autosize=False, width=1100, height=400)
)
fig.show()
In [11]:
# Filter `duration` outliers with the 1.5 * IQR rule.
Q1 = df['duration'].quantile(0.25)
Q3 = df['duration'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
# Previously computed but never used; the bound was re-derived inline below.
upper_bound = Q3 + 1.5 * IQR

df = df[(df['duration'] > lower_bound) & (df['duration'] < upper_bound)]
# Additional, tighter manual cap (kept for behavioral parity).
# NOTE(review): 490 is a magic number presumably read off the box plot above —
# consider hoisting it to a named config constant.
df = df[df['duration'] < 490]
In [12]:
# Re-plot `duration` after filtering to confirm the outliers are gone.
fig = (
    px.box(df, x='duration')
    .update_traces(marker={'color': '#FF851B'})
    .update_layout(title='Detected Outliers', autosize=False, width=1100, height=400)
)
fig.show()
In [13]:
def pie_plot(ax, col, df, title="Pie Chart"):
    """Draw a pastel pie chart of the value counts of `col` onto `ax`.

    Slices are labelled with the category names and annotated with
    one-decimal percentages; `title` becomes the axes title.
    """
    counts = df[col].value_counts()
    palette = sns.color_palette('pastel')[:len(counts)]
    ax.pie(
        counts.values,
        labels=counts.index,
        autopct='%.1f%%',
        colors=palette,
        startangle=140,
    )
    ax.set_title(title)

# One pie chart per categorical column of interest, laid out on a 2x3 grid.
columns = ['job', 'education', 'loan', 'month', 'day_of_week', 'contact']
titles = ['Distribution of Jobs', 'Distribution of Education', 'Distribution of Loan',
          'Distribution of Month', 'Distribution of Day of Week', 'Distribution of Contact']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

for ax, col, title in zip(axes.flatten(), columns, titles):
    pie_plot(ax, col, df, title=title)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [14]:
# Histogram + KDE for every numeric column, with the column mean marked.
numeric_columns = df.select_dtypes(include=['int64', 'float64'])
numeric_columns_list = numeric_columns.columns.tolist()

# Size the grid from the number of numeric columns. The original zipped the 10
# numeric columns against the unrelated 6-element `titles` list from the
# pie-chart cell, which silently truncated the output to the first 6 columns
# (and never used `title`).
n_cols = 3
n_rows = -(-len(numeric_columns_list) // n_cols)  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
axes = axes.flatten()

for ax, col in zip(axes, numeric_columns_list):
    sns.histplot(df[col], kde=True, bins=10, ax=ax)
    ax.set_title(f"{col} distribution")
    ax.tick_params(axis='x', rotation=45)

    # All selected columns are numeric by construction, so no dtype re-check.
    mean_value = df[col].mean()
    ax.axvline(mean_value, ls='--', color='red')

    # Place the label near the top of the plot, next to the mean line.
    max_y = ax.get_ylim()[1]
    ax.text(mean_value, max_y * 0.85, f'<-- Mean: {mean_value:.2f}', color='black')

# Hide any leftover empty axes in the grid.
for ax in axes[len(numeric_columns_list):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [15]:
# Bar chart of job frequencies, sorted descending and annotated with counts.
job_order = df['job'].value_counts().index

plt.figure(figsize=(12, 4))
ax = sns.countplot(data=df, x='job', hue='job', order=job_order, palette='viridis')
plt.xticks(rotation=45)

# Write each bar's count just above its top edge.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{height}',
                (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center',
                fontsize=10, color='black',
                xytext=(0, 5), textcoords='offset points')

plt.title("Job Distribution")
plt.show()
No description has been provided for this image
In [16]:
# Class balance of the target variable `y` (subscription yes/no).
y_order = df['y'].value_counts().index

plt.figure(figsize=(5, 3))
ax = sns.countplot(data=df, x='y', hue='y', order=y_order, palette='viridis')
plt.xticks(rotation=45)

# Write each bar's count just above its top edge.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{height}',
                (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center',
                fontsize=10, color='black',
                xytext=(0, 5), textcoords='offset points')

plt.title("Distribution of Y")
plt.show()
No description has been provided for this image
In [17]:
# Work on a copy so the cleaned frame `df` stays intact for further EDA;
# encode the target ('no'/'yes' -> 0/1; LabelEncoder sorts labels alphabetically).
df1= df.copy()
df1['y'] = le.fit_transform(df1['y'])
In [18]:
# Confirm the target is now numeric (the remaining object columns are encoded next).
df1.dtypes
Out[18]:
age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                   int32
dtype: object
In [19]:
# Label-encode every remaining object column.
# A fresh LabelEncoder is fitted per column and kept in `label_encoders`:
# refitting the single global `le` (as before) discarded each previous
# column's mapping, making inverse_transform impossible for all but the
# last column. The encoded values are identical either way.
categorical_variables = df1.select_dtypes(include='object').columns.tolist()
label_encoders = {}
for column in categorical_variables:
    encoder = LabelEncoder()
    df1[column] = encoder.fit_transform(df1[column])
    label_encoders[column] = encoder
df1.dtypes
Out[19]:
age                 int64
job                 int32
marital             int32
education           int32
default             int32
housing             int32
loan                int32
contact             int32
month               int32
day_of_week         int32
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome            int32
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                   int32
dtype: object
In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features / target split, then an 80/20 train/test split.
X = df1.drop(columns='y')
y = df1['y']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only to avoid leaking test statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# Small feed-forward binary classifier: 16 -> 8 -> 1 (sigmoid output).
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
In [22]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
In [23]:
# Train for 10 epochs; the last 20% of the training data is held out for validation.
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.2)
Epoch 1/10
714/714 ━━━━━━━━━━━━━━━━━━━━ 10s 8ms/step - accuracy: 0.8071 - loss: 0.3860 - val_accuracy: 0.9439 - val_loss: 0.1455
Epoch 2/10
714/714 ━━━━━━━━━━━━━━━━━━━━ 5s 7ms/step - accuracy: 0.9483 - loss: 0.1351 - val_accuracy: 0.9476 - val_loss: 0.1403
Epoch 3/10
714/714 ━━━━━━━━━━━━━━━━━━━━ 5s 8ms/step - accuracy: 0.9486 - loss: 0.1335 - val_accuracy: 0.9494 - val_loss: 0.1374
Epoch 4/10
714/714 ━━━━━━━━━━━━━━━━━━━━ 5s 7ms/step - accuracy: 0.9517 - loss: 0.1260 - val_accuracy: 0.9478 - val_loss: 0.1367
Epoch 5/10
714/714 ━━━━━━━━━━━━━━━━━━━━ 5s 7ms/step - accuracy: 0.9524 - loss: 0.1234 - val_accuracy: 0.9502 - val_loss: 0.1345
Epoch 6/10
714/714 ━━━━━━━━━━━━━━━━━━━━ 4s 6ms/step - accuracy: 0.9544 - loss: 0.1150 - val_accuracy: 0.9501 - val_loss: 0.1330
Epoch 7/10
714/714 ━━━━━━━━━━━━━━━━━━━━ 5s 7ms/step - accuracy: 0.9516 - loss: 0.1185 - val_accuracy: 0.9494 - val_loss: 0.1324
Epoch 8/10
714/714 ━━━━━━━━━━━━━━━━━━━━ 5s 6ms/step - accuracy: 0.9509 - loss: 0.1222 - val_accuracy: 0.9504 - val_loss: 0.1321
Epoch 9/10
714/714 ━━━━━━━━━━━━━━━━━━━━ 6s 8ms/step - accuracy: 0.9536 - loss: 0.1138 - val_accuracy: 0.9502 - val_loss: 0.1300
Epoch 10/10
714/714 ━━━━━━━━━━━━━━━━━━━━ 6s 9ms/step - accuracy: 0.9518 - loss: 0.1181 - val_accuracy: 0.9508 - val_loss: 0.1295
In [24]:
# Final evaluation on the held-out test split.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy * 100:.2f} %")
223/223 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - accuracy: 0.9467 - loss: 0.1372
Test Loss: 0.1230, Test Accuracy: 94.98 %
In [25]:
# Train vs validation accuracy per epoch.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(history.history['accuracy'], label='Train Data Accuracy')
ax.plot(history.history['val_accuracy'], label='Validation Data Accuracy')
ax.set_title('Model Accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend()
plt.show()
No description has been provided for this image